# Standard library
import itertools
import json
import textwrap
from collections import Counter
from urllib.request import urlopen

# Third-party
import numpy as np  # linear algebra
import pandas as pd  # data manipulation and analysis
from scipy.spatial import ConvexHull, convex_hull_plot_2d
import matplotlib.path as mpath
import matplotlib.pyplot as plt  # used by the bar plots below; was missing
import plotly.express as px
import plotly.graph_objects as go
# read-in dataframe with cluster ids
# Loads the per-county population table; column 0 is used as the row index.
# NOTE(review): the file is expected in the working directory; visible columns
# include demographic counts, 'cluster_id', 'Banned or not', and 'County Name'.
cluster2_df = pd.read_csv("popClusterData.csv", index_col=0)
cluster2_df.head()
| Population EstimatesJuly 1 2021() | Persons under 5 years | Persons under 18 years | Persons 65 years and over | Female persons | White alone | Black or African American alone | American Indian and Alaska Native alone | Asian alone | Native Hawaiian and Other Pacific Islander alone | Two or More Races | Hispanic or Latino | White alonenot Hispanic or Latino | Veterans2017-2021 | Foreign born persons2017-2021 | cluster_id | Banned or not | County Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27542 | 1707.60 | 6665.16 | 5095.27 | 13798.54 | 26660.66 | 165.25 | 165.25 | 82.63 | 0.00 | 440.67 | 302.96 | 26385.24 | 1840 | 192.79 | 0 | 0.0 | Adams County, Ohio |
| 1 | 101670 | 6100.20 | 23587.44 | 18503.94 | 50021.64 | 84081.09 | 12912.09 | 305.01 | 915.03 | 0.00 | 3456.78 | 3761.79 | 81132.66 | 6098 | 1830.06 | 0 | 1.0 | Allen County, Ohio |
| 2 | 52316 | 2929.70 | 11666.47 | 10201.62 | 26524.21 | 50484.94 | 470.84 | 156.95 | 418.53 | 52.32 | 784.74 | 837.06 | 49752.52 | 3076 | 784.74 | 0 | 0.0 | Ashland County, Ohio |
| 3 | 97337 | 5450.87 | 21414.14 | 19467.40 | 47889.80 | 90231.40 | 3796.14 | 389.35 | 486.68 | 97.34 | 2433.43 | 4672.18 | 86240.58 | 7158 | 1557.39 | 0 | 0.0 | Ashtabula County, Ohio |
| 4 | 62056 | 2296.07 | 8998.12 | 8874.01 | 31090.06 | 56657.13 | 1799.62 | 248.22 | 1737.57 | 62.06 | 1613.46 | 1241.12 | 55726.29 | 3255 | 2544.30 | 0 | 0.0 | Athens County, Ohio |
from functionsAll import split_dataframe_by_cluster, get_cluster_coords_dict,coords, clusterk_dict,\
cluster0, cluster1, cluster2
# 1: split the full dataframe into one dataframe per cluster id
df_list = split_dataframe_by_cluster(cluster2_df, 'cluster_id')
# 2: extract coordinate collections for each per-cluster dataframe
# NOTE(review): exact semantics come from functionsAll — confirm there.
coords_list = coords(df_list, cluster2_df)
# 3: build a dict keyed by cluster holding its coordinates
clusterK_dict = clusterk_dict(df_list, coords_list)
# Create cluster{k}_dict for each cluster found (up to 3, matching the
# cluster0/cluster1/cluster2 helpers imported above).
# Fix: the original ran the whole if/elif chain once per loop iteration,
# rebuilding the exact same dictionaries len(df_list) times.
n_clusters = len(df_list)
if n_clusters >= 1:
    cluster0_dict = cluster0(clusterK_dict)
if n_clusters >= 2:
    cluster1_dict = cluster1(clusterK_dict)
if n_clusters >= 3:
    cluster2_dict = cluster2(clusterK_dict)
# Preserve the original per-iteration status output.
for i in range(n_clusters):
    print(f"cluster{i}_dict have been made to dictionary")
cluster0_dict have been made to dictionary cluster1_dict have been made to dictionary cluster2_dict have been made to dictionary
from functionsBanned import filter_banned_counties, get_banned_cluster_coords_dict, bannedCoords, clusterk_dict_banned,\
cluster0Banned, cluster1Banned, cluster2Banned
# 1: keep only the counties flagged as having a ban
banned_counties_df = filter_banned_counties(cluster2_df)
# 2: coordinate collections for the banned counties
bannedCoords_list = bannedCoords(banned_counties_df, cluster2_df)
# 3: per-cluster dict of banned-county coordinates
# NOTE(review): helper semantics come from functionsBanned — confirm there.
clusterKBanned_dict = clusterk_dict_banned(banned_counties_df, bannedCoords_list)
Cluster0 had enough banned counties to find non-banned counties in the banned counties convex hull.
from countyName import banned_counties_list, countyNames_cluster0, countyNames_cluster1, countyNames_cluster2, merge_dicts
# List of all banned county names, used for filtering below.
bannedCountiesList = banned_counties_list(cluster2_df)

# Build the banned-cluster dicts and combine the county-name mappings.
# Fixes over the original:
#   * the surrounding for-loop rebuilt identical dicts on every pass;
#   * the 3-cluster branch called countyNames_cluster0 with the cluster-2
#     arguments instead of countyNames_cluster2;
#   * the 3-cluster branch never merged countyName2 into allCounties,
#     leaving allCounties unset (NameError downstream).
# NOTE(review): len(banned_counties_df) is assumed to be the number of
# banned clusters (mirroring len(df_list) above) — confirm against
# functionsBanned.
n_banned = len(banned_counties_df)
if n_banned >= 1:
    cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
    countyName0 = countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
    allCounties = countyName0
if n_banned >= 2:
    cluster1_banned_dict = cluster1Banned(clusterKBanned_dict)
    countyName1 = countyNames_cluster1(cluster2_df, cluster1_dict, cluster1_banned_dict, bannedCountiesList)
    allCounties = merge_dicts(countyName0, countyName1)
if n_banned >= 3:
    cluster2_banned_dict = cluster2Banned(clusterKBanned_dict)
    countyName2 = countyNames_cluster2(cluster2_df, cluster2_dict, cluster2_banned_dict, bannedCountiesList)
    allCounties = merge_dicts(allCounties, countyName2)
# Preserve the original per-iteration status output.
for i in range(n_banned):
    print(f"cluster{i}_banned_dict have been made to dictionary")
cluster0_banned_dict have been made to dictionary
Anything above here is good
from topVariableFunctions import filtered_var_pairs, categoryCountyList, subcategoryCountyList, commonKeys, freq_var
# Pairwise demographic-variable analysis over the combined county mapping.
filt = filtered_var_pairs(allCounties)
# counties grouped by the category variable of each pair
categorycountyList = categoryCountyList(filt)
# counties grouped by the subcategory variable of each pair
subcategorycountyList = subcategoryCountyList(filt)
# variables common to both groupings, mapped to their counties
combinedVars = commonKeys(categorycountyList, subcategorycountyList)
The category with the most counties is 'Black or African American alone' with 28 counties. The subcategory with the most counties is 'Two or More Races' with 28 counties.
After all variables are combined, we can now look at the counties that may be most at risk and the most common variables among them.
# Order the variable -> counties mapping by how many counties each
# variable covers (largest first).
sorted_var_dict = dict(
    sorted(combinedVars.items(), key=lambda pair: len(pair[1]), reverse=True)
)

def _wrap_county(name):
    # Break the county name into 10-char chunks, then re-fill to 80 cols.
    return textwrap.fill(', '.join(textwrap.wrap(name, width=10)), width=80)

# Ranking rows: rank, variable, county count, wrapped county listing.
ranked_listTbl = []
for rank, (variable, county_names) in enumerate(sorted_var_dict.items(), start=1):
    listing = ", ".join(_wrap_county(c) for c in county_names)
    ranked_listTbl.append((rank, variable, len(county_names), listing))

topVarTbl = pd.DataFrame(ranked_listTbl)
topVarTbl.columns = ['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']
# TODO: tune column widths for readability.
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'

# Alternating row colors, repeated to cover all table rows.
_row_fill = [[rowOddColor, rowEvenColor, rowOddColor, rowEvenColor, rowOddColor] * 5]

_header = dict(
    values=['Rank', 'Demographic Variable', 'Number of Counties', 'List of Counties'],
    line_color='darkslategray',
    fill_color=headerColor,
    align=['left', 'center'],
    font=dict(color='white', size=12),
)
_cells = dict(
    values=[topVarTbl.Rank, topVarTbl.Demographic_variable,
            topVarTbl.Number_of_counties, topVarTbl.List_of_Counties],
    line_color='darkslategray',
    fill_color=_row_fill,
    align=['center', 'center', 'center', 'left'],
    font=dict(color='darkslategray', size=11),
)
fig = go.Figure(data=[go.Table(columnwidth=[50, 110, 75, 700],
                               header=_header, cells=_cells)])
fig.show()
Next, we check which counties had the most demographic variables, then show them in a table.
# Ranking table again, but keeping the raw county lists so each county
# can be broken out onto its own row.
ranked_list = [
    (rank, variable, len(county_names), county_names)
    for rank, (variable, county_names) in enumerate(sorted_var_dict.items(), start=1)
]
newdf2 = pd.DataFrame(ranked_list)
newdf2.columns = ['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']
# One row per (variable, county) pair.
newdf2 = newdf2.explode('List_of_Counties')
# Drop repeated (Rank, county) rows.
newdf2 = newdf2.drop_duplicates(subset=['Rank', 'List_of_Counties'])
# Clearer column names for the exploded frame.
newdf3 = newdf2.rename(columns={
    'Demographic_variable': 'Demographic_Variable',
    'Number_of_counties': 'Number_of_Counties',
    'List_of_Counties': 'County',
})
# Display the updated data frame
newdf3.head()
# Melting OG data:
# Reshape the wide dataframe to long form: one row per (county, attribute).
melted_df = pd.melt(cluster2_df, id_vars=['County Name'], var_name='Attribute', value_name='Value')
# Strip the trailing 13 characters — assumes every name ends with the exact
# suffix " County, Ohio" so the short names match other lists; TODO confirm.
melted_df['County Name'] = melted_df['County Name'].apply(lambda x: x[:-13])
melted_df
# Attach each county/variable's raw value to the ranked rows.
tidy_df = pd.merge(newdf3, melted_df[['County Name', 'Attribute', 'Value']], left_on=['County','Demographic_Variable'],
                   right_on=['County Name', 'Attribute'])
# The merge keys are duplicated across both frames; keep one copy.
tidy_df.drop(['Attribute','County Name'], axis=1, inplace=True)
tidy_df.head()
# Bar chart: one bar group per demographic variable, colored by county,
# each segment labeled with the attribute's raw value.
fig = px.bar(
    tidy_df,
    x=tidy_df.Demographic_Variable,
    y=tidy_df.Number_of_Counties,
    text='Value',
    color='County',
    labels={"Attribute": "Attribute", "Value": "Attribute Value"},
    title="Default behavior: some text is tiny",
)
fig.update_traces(textposition='inside')
# Enlarge the canvas so the long category labels stay legible.
fig.update_layout(width=1200, height=1400)
# display the plot
fig.show()
# Per-county frequency of demographic variables.
# Fix: the original assigned the result back to the name `freq_var`,
# shadowing the imported freq_var() function — a second run of this cell
# would raise TypeError. Use a distinct name for the result; also drop the
# pointless enumerate() in the dict build.
var_freq = freq_var(ranked_list)
# county -> list of variables
var_freq_dict = {county: variables for county, variables in var_freq}
# Counties ordered by how many variables they carry (largest first).
sorted_var_freq_dict = dict(
    sorted(var_freq_dict.items(), key=lambda x: len(x[1]), reverse=True)
)
# Make ranking table of vars (display form, with wrapped variable lists).
ranked_var_freqViz = [
    (i + 1, county, len(variables),
     ", ".join(textwrap.fill(' '.join(textwrap.wrap(v, width=10)), width=80)
               for v in variables))
    for i, (county, variables) in enumerate(sorted_var_freq_dict.items())
]
top_var_freqdfViz = pd.DataFrame(ranked_var_freqViz)
top_var_freqdfViz.columns = ['Rank', 'County', 'Number_of_variables', 'List_of_variables']
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'

# Table of counties ranked by number of demographic variables.
_alt_rows = [[rowOddColor, rowEvenColor, rowOddColor, rowEvenColor, rowOddColor] * 5]
fig = go.Figure(data=[go.Table(
    columnwidth=[40, 70, 100, 600],
    header=dict(
        values=['Rank', 'County Name', 'Number of Demographic Variables', 'List of Demographics'],
        line_color='darkslategray',
        fill_color=headerColor,
        align=['center', 'center', 'center', 'left'],
        font=dict(color='white', size=12),
    ),
    cells=dict(
        values=[top_var_freqdfViz.Rank,
                top_var_freqdfViz.County,
                top_var_freqdfViz.Number_of_variables,
                top_var_freqdfViz.List_of_variables],
        line_color='darkslategray',
        fill_color=_alt_rows,
        align=['center', 'center', 'center', 'left'],
        font=dict(color='darkslategray', size=11),
    ),
)])
fig.show()
# Ranking table with raw variable lists so each variable gets its own row.
ranked_vars_list = []
for rank, (county, variables) in enumerate(sorted_var_freq_dict.items(), start=1):
    ranked_vars_list.append((rank, county, len(variables), variables))
top_var_freqdf = pd.DataFrame(ranked_vars_list)
top_var_freqdf.columns = ['Rank', 'County', 'Number_of_variables', 'List_of_variables']
# One row per (county, variable) pair.
test2 = top_var_freqdf.explode('List_of_variables')
# Drop repeated (Rank, variable) rows.
test2 = test2.drop_duplicates(subset=['Rank', 'List_of_variables'])
# Clearer name for the exploded variable column.
test5 = test2.rename(columns={'List_of_variables': 'Demographic_Variable'})
# Attach the raw attribute values from the melted frame.
tidy_df2 = pd.merge(
    test5,
    melted_df[['County Name', 'Attribute', 'Value']],
    left_on=['County', 'Demographic_Variable'],
    right_on=['County Name', 'Attribute'],
)
# Stacked bar chart: one bar per county, segmented by demographic variable.
fig = px.bar(
    tidy_df2,
    x=tidy_df2.County,
    color='Demographic_Variable',
    text='Value',
    labels={
        "Attribute": "Attribute",
        "Value": "Attribute Value",
        "Demographic_Variable": "Demographic Var.",
    },
    title="Default behavior: some text is tiny",
)
fig.update_traces(textposition='inside')
# Wide canvas so every county fits on one row.
fig.update_layout(width=1500, height=500)
# display the plot
fig.show()
## Most common variables
# Flatten all per-county variable lists into one list, then dedupe.
# Fix: sum(lists, []) re-concatenates on every step (quadratic);
# itertools.chain.from_iterable is linear and produces the same list.
res = list(itertools.chain.from_iterable(sorted_var_freq_dict.values()))
# Unique variables (set() drops ordering, as before).
mostCommonVar = list(set(res))
mostCommonVar
# All counties appearing in the ranked table, in rank order.
mostCommonCounties = top_var_freqdfViz['County'].tolist()
print(mostCommonCounties)
# Download US county boundaries (GeoJSON) and a county-level demographics
# table that carries FIPS codes.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv',
                 dtype={"fips": str})
# Keep only Ohio rows.
df_ohio = df[df['STNAME'] == 'Ohio']
# Ohio FIPS codes start with '39'; keep only those GeoJSON features.
counties_ohio = {
    'type': 'FeatureCollection',
    'features': [f for f in counties['features'] if f['id'][:2] == '39'],
}
# Make new data frame with the most common vars
# Temporary short name: strip the trailing 6 characters — assumes every name
# ends with the exact suffix ", Ohio" so it matches df_ohio's CTYNAME
# ("X County"); TODO confirm.
cluster2_df['TempCounty Name'] = cluster2_df['County Name'].apply(lambda x: x[:-6])
# merge the two data frames based on the 'County Name' and 'CTYNAME' columns
merged_df = pd.merge(cluster2_df, df_ohio[['CTYNAME', 'FIPS']], left_on='TempCounty Name', right_on='CTYNAME')
# drop the duplicate 'CTYNAME' column
merged_df.drop('CTYNAME', axis=1, inplace=True)
merged_df.drop('TempCounty Name', axis=1, inplace=True)
# NOTE(review): the temporary column added to cluster2_df above is dropped
# only from merged_df; cluster2_df itself keeps 'TempCounty Name'.
# print the merged data frame
merged_df.head()
# Short names without the trailing 13-char suffix, to compare against the
# at-risk and banned county name lists.
merged_df['TempCounty Name'] = merged_df['County Name'].apply(lambda x: x[:-13])
# Example list of counties to check
counties_to_check = merged_df['TempCounty Name'].tolist()
# Counties appearing in neither the at-risk list nor the banned list.
safe_counties = [
    county for county in counties_to_check
    if county not in mostCommonCounties and county not in bannedCountiesList
]
# Print the list of safe counties
print(safe_counties)
print(len(safe_counties))
# Classify each county's risk level.
# Fix: the original lambda's final fallback read merged_df['risk'] before
# that column existed, which would raise KeyError if ever reached. The
# three categories are exhaustive by construction (safe_counties is exactly
# the complement of the other two lists), so 'Safe' is the default.
def _risk_level(county):
    """Return the risk label for a short county name."""
    if county in mostCommonCounties:
        return 'At risk'
    if county in bannedCountiesList:
        return 'Already has Banned'
    return 'Safe'

merged_df['risk'] = merged_df['TempCounty Name'].apply(_risk_level)
merged_df.head()
# Choropleth of Ohio counties colored by risk level.
# Fix: the original call contained an unterminated string fragment
# (`text='`), a syntax error; the stray argument is removed.
fig = px.choropleth(merged_df, geojson=counties_ohio, locations='FIPS', color='risk',
                    color_continuous_scale="Viridis",
                    range_color=(0, 2),
                    scope="usa",
                    hover_data=["County Name"],
                    labels={'risk': 'Risk Level'})
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
# Summarize countyName0: one record per (category, subcategory) key with
# the number of counties it maps to.
filtered_pairs = [
    {'Category': category, 'Subcategory': subcategory, 'Number of Counties': len(value)}
    for (category, subcategory), value in countyName0.items()
]
df = pd.DataFrame(filtered_pairs)
# Most frequent pairs first.
dfSorted = df.sort_values(by='Number of Counties', ascending=False)
dfSorted.head()
## Plot
# Bar plot of category frequencies (matplotlib.pyplot as plt).
plt.figure(figsize=(10, 6))
plt.bar(dfSorted['Category'], dfSorted['Number of Counties'])
plt.xticks(rotation=90)  # long labels read better vertically
plt.title('Frequency of Housing Data Categories', fontsize=14)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.show()
# Bar plot of subcategory frequencies.
plt.figure(figsize=(10, 6))
# Create the bar plot
plt.bar(dfSorted['Subcategory'], dfSorted['Number of Counties'])
# Rotate the x-axis labels to make them easier to read
plt.xticks(rotation=90)
# Set the title and axis labels
plt.title('Frequency of Housing Data Subcategories', fontsize=14)
# Fix: the x-axis label was misspelled 'Subategory'.
plt.xlabel('Subcategory', fontsize=12)
plt.ylabel('Count', fontsize=12)
# Show the plot
plt.show()
# Step 2: Calculate frequency of each county across all tuple keys KEEP
# Idiom fix: collections.Counter replaces the hand-rolled tally.
# Counter is a dict subclass, so the downstream .keys()/.values()/indexing
# usage is unchanged.
county_freq = Counter(
    county for counties in countyName0.values() for county in counties
)
# Step 4: Rank the counties based on frequency counts KEEP
# (stable sort, so ties keep their first-seen order, as before)
ranked_counties = sorted(county_freq, key=lambda county: county_freq[county], reverse=True)
# NOTE(review): redundant import — plotly.graph_objects is already bound to
# `go` at the top of the file; kept to preserve the original script flow.
import plotly.graph_objs as go

# Bar chart of county frequencies inside the banned-county convex hulls.
trace = go.Bar(
    x=list(county_freq.keys()),
    y=list(county_freq.values()),
)
layout = go.Layout(
    title='Frequency of Counties that show up in the convex hull of banned counties',
    xaxis=dict(title='County', tickangle=90, tickfont=dict(size=10)),
    yaxis=dict(title='Frequency'),
    height=600,
    width=1000,
)
fig = go.Figure(data=[trace], layout=layout)
fig.show()
# Unique counties across all values of the mapping.
values = set(val for lst in countyName0.values() for val in lst)
unique_counties = list(values)
print(unique_counties)
# Collect every category/subcategory name from the tuple keys.
# NOTE(review): despite the name, `counties` holds demographic variable
# names (the key halves), not county names.
counties = []
for key, value in countyName0.items():
    category, subcategory = key
    counties.append(category)
    counties.append(subcategory)
# Idiom fix: order-preserving dedupe via dict.fromkeys instead of a
# quadratic `not in` membership loop — same result, linear time.
most_common_keys = list(dict.fromkeys(counties))
most_common_keys
We want to find the counties that appear most often, along with their respective variables.
# Map each county to the demographic variables it appears under,
# without duplicates.
# Fix: when a county was seen for the first time, the original stored only
# the first variable of the pair and silently dropped the second; both
# variables of every pair are now recorded.
county_demographics = {}
for demographic_stats, county_list in countyName0.items():
    # unpack the two variables from the demographic stats pair
    demographic_variable1, demographic_variable2 = demographic_stats
    for county in county_list:
        seen = county_demographics.setdefault(county, [])
        for variable in (demographic_variable1, demographic_variable2):
            if variable not in seen:
                seen.append(variable)
# county_demographics
# For each county, keep only the variables that also occur for at least one
# other county. (Duplicates within a county's list are handled below.)
shared_vars = {}
for county, var_list in county_demographics.items():
    shared = []
    for candidate in var_list:
        appears_elsewhere = any(
            candidate in other_vars
            for other_county, other_vars in county_demographics.items()
            if other_county != county
        )
        if appears_elsewhere:
            shared.append(candidate)
    shared_vars[county] = shared
# shared_vars
# Deduplicate each county's variable list.
# NOTE: going through set() does not preserve the original ordering of the
# values, same as the original code.
updated_dict = {
    county: list(set(variables)) for county, variables in shared_vars.items()
}
# updated_dict
def are_dicts_equal(dict1, dict2):
    """Return True if the two dicts hold identical key/value pairs.

    Dict equality in Python is already order-independent, so the
    original sort-then-compare detour was unnecessary (and would raise
    TypeError on unsortable keys); plain ``==`` gives the same answer.
    The length check is kept as a cheap short-circuit.
    """
    if len(dict1) != len(dict2):
        return False
    return dict1 == dict2
are_dicts_equal(shared_vars,county_demographics)
def are_dicts_equal(dict1, dict2, dict3):
    """Return True if all three dicts hold identical key/value pairs.

    NOTE(review): this redefinition shadows the two-argument
    are_dicts_equal above, so any later two-argument call would fail.
    Dict equality is order-independent, so the original's sorting was
    unnecessary; chained ``==`` gives the same answer.
    """
    if len(dict1) != len(dict2) or len(dict1) != len(dict3):
        return False
    return dict1 == dict2 == dict3
are_dicts_equal(county_demographics, shared_vars, updated_dict)